
## This R file will: 
## 1 - load up data extracted from the UDD, descriptive graph data, and mean underprod data
## 2 - clean that data as needed 
## 3 - save out the data for further processing

## 1 - load UDD, graph, underprod data
library('dplyr')
library('reldist') 
library('ggplot2')
library('scales')
library('stringr')
library('tidyr')
## Directory prefixes. Each ends in '/' because file names are appended with paste0().
dataDir <- '/data_ext/users/kcz1100/kaylea_dissertation/collabnetXS/processed_data/'
rawDir  <- '/data_ext/users/kcz1100/kaylea_dissertation/collabnetXS/raw_data/'
outDir  <- '/data_ext/users/kcz1100/kaylea_dissertation/eseJournal/'

## Underproduction results (comma-separated).
upDF <- read.csv(paste0(rawDir, 'inst_all_packages_full_results.csv'),
                 stringsAsFactors = FALSE)

## Saved workspaces. load() drops their objects straight into the global
## environment, so the order of these statements is kept as-is.
load(paste0(dataDir, 'maintDF.RData'))           # maintainership spells (spellsDF)
load(paste0(dataDir, 'commenterGraphData.RData')) # network graph data
## NOTE(review): presumably this second file supplies centDF.comment,
## projOnly.comment and compDF.comment, which are used further down but never
## created in this script -- confirm.

## Tab-separated inputs, plus two constants used by the cleaning steps below.
releaseDF <- read.csv(paste0(dataDir, 'public_packages_data_canonical.tsv'),
                      sep = '\t', stringsAsFactors = FALSE)
langAgeDF <- read.csv(paste0(rawDir, 'language_age.tsv'),
                      sep = '\t', stringsAsFactors = FALSE)
pattern <- 'implemented-in::(.*)' # capture group = everything after 'implemented-in::'
qaList <- c(
  'packages@qa.debian.org',
  'debian-qa@lists.debian.org',
  'frankie@qa.debian.org'
)
notesDF <- read.csv(paste0(dataDir, 'changelogDates.tsv'),
                    sep = '\t', stringsAsFactors = FALSE)
langDF <- read.csv(paste0(dataDir, 'language_props.tsv'),
                   sep = '\t', stringsAsFactors = FALSE)
uploadDF <- read.csv(paste0(dataDir, 'public_upload_history_canonical.tsv'),
                     sep = '\t', stringsAsFactors = FALSE)

## clean upload
## Drop columns that are not needed downstream: the pandas index plus the
## per-file size / checksum / filename fields.
## NOTE(review): 'version' was marked "don't need" here, but the statement was a
## bare `uploadDF$version`, which removed nothing and only printed the whole
## column. The column is still kept so that unique() below and the saved
## uploadDF are unchanged; add 'version' to this vector if it should really go.
uploadDropCols <- c('X', 'installed_size', 'size', 'md5sum', 'sha256', 'filename')
uploadDF <- uploadDF[, !(names(uploadDF) %in% uploadDropCols), drop = FALSE]

## 'TODO' placeholder cells become missing values.
uploadDF[uploadDF == 'TODO'] <- NA

## Keep rows whose nmu flag is not 't'. subset() treats NA conditions as FALSE,
## so rows with a missing nmu are dropped as well.
uploadDF <- subset(uploadDF, uploadDF$nmu != 't')

## Rename source -> package so this table shares its key with the others.
uploadDF$package <- uploadDF$source
uploadDF$source <- NULL

## Remove exact duplicate rows.
uploadDF <- unique(uploadDF)




## additional fields
## Three 0/1 indicators derived from the canonical maintainer address:
##   this_is_qa     - address is one of the entries in qaList
##   this_is_qalist - address contains 'qa@lists'
##   this_is_list   - address contains '@lists'
## A missing address yields 0 in every column.
uploadDF <- uploadDF %>%
  mutate(
    this_is_qa = if_else(canonicalMaintainerName %in% qaList, 1, 0),
    this_is_qalist = if_else(str_detect(canonicalMaintainerName, 'qa@lists'), 1, 0, missing = 0),
    this_is_list = if_else(str_detect(canonicalMaintainerName, '@lists'), 1, 0, missing = 0)
  )





##underprod from prior work prep here
## Classify each row by the sign of its up.fac interval bounds.
## up.fac is reverse coded lower and upper.
boundsBothPositive <- upDF$up.fac.upper > 0 & upDF$up.fac.lower > 0
boundsBothNegative <- upDF$up.fac.upper < 0 & upDF$up.fac.lower < 0
boundsStraddleZero <- upDF$up.fac.upper < 0 & upDF$up.fac.lower > 0

## Numeric coding: 1 / 0 / NA.
upDF$underprod[boundsBothPositive] <- 1
upDF$underprod[boundsBothNegative] <- 0
upDF$underprod[boundsStraddleZero] <- NA

## Logical coding of the same three cases.
upDF$is_underprod[boundsBothPositive] <- TRUE
upDF$is_underprod[boundsBothNegative] <- FALSE
upDF$is_underprod[boundsStraddleZero] <- NA

## The masks are scratch values only; do not leave them in the workspace.
rm(boundsBothPositive, boundsBothNegative, boundsStraddleZero)

## Rename pkg -> package to match the key used by the other tables.
upDF$package <- upDF$pkg
upDF$pkg <- NULL


##release prep here
## parse errors will echo: rows whose package field contains '::' are shown
## once for inspection and then removed.
releaseDF %>% filter(str_detect(package, '::'))
releaseDF <- releaseDF %>% filter(!str_detect(package, '::'))

## Drop every column that is not used in this analysis.
releaseDropCols <- c(
  'architecture', 'size', 'installed_size', 'md5sum', 'section',
  'description', 'description_md5', 'depends', 'recommends', 'suggests',
  'enhances', 'pre_depends', 'breaks', 'homepage', 'build_essential',
  'origin', 'sha1', 'replaces', 'python_version', 'ruby_versions',
  'provides', 'conflicts', 'sha256', 'package_type', 'filename'
)
releaseDF <- releaseDF[, !(names(releaseDF) %in% releaseDropCols), drop = FALSE]
releaseDF <- unique(releaseDF)

## Does the tag string mention an implementation language? A missing tag
## counts as FALSE. The table() is a quality check.
releaseDF$hasLanguage <- case_when(str_detect(releaseDF$tag, 'implemented') ~ TRUE, TRUE ~ FALSE)
table(releaseDF$hasLanguage)

## Everything after 'implemented-in::' in the tag string (NA when absent).
## These derived columns used to be computed twice in a row with a second
## unique() in between; one pass gives the same result.
releaseDF$messyLanguage <- str_match(releaseDF$tag, pattern)[, 2]
#table(releaseDF$messyLanguage)

## Keep only the text before the first comma as the language. extra/fill are
## spelled out so rows with more or fewer pieces are handled exactly as under
## the defaults (extra pieces dropped, missing pieces NA) without warnings.
releaseDF <- releaseDF %>%
  separate(messyLanguage, c('language', 'junk'), sep = ",", extra = 'drop', fill = 'right')
releaseDF$junk <- NULL #some will not have anything in this column
releaseDF[releaseDF == 'TODO'] <- NA




##Spellprep here

## Spells are cut off at this instant (UTC).
cutpoint <- as.POSIXct('2020-07-07 00:00:00', format="%Y-%m-%d %H:%M:%S", tz='UTC')
table(is.na(spellsDF$spellEnd)) # how many spells have no recorded end?

## NOTE(review): no tz is given for these two conversions, whereas cutpoint is
## built with tz='UTC' -- confirm the session time zone makes these comparable.
spellsDF$spellBegin <- as.POSIXct(spellsDF$spellBegin, format="%Y-%m-%d")
spellsDF$spellEnd <- as.POSIXct(spellsDF$spellEnd, format='%Y-%m-%d')

spellsDF <- subset(spellsDF, spellsDF$spellBegin < cutpoint) #drop too-recent maintainers

## A spell with no end date is flagged as censored and closed at the cutpoint.
## NOTE(review): spells ending after the cutpoint are truncated on the line
## after that but keep censored == FALSE -- confirm that is intended.
spellsDF$censored <- is.na(spellsDF$spellEnd)
spellsDF$spellEnd[is.na(spellsDF$spellEnd)] <- cutpoint
spellsDF$spellEnd[(spellsDF$spellEnd > cutpoint)] <- cutpoint

## Spell length as a difftime in days (both columns are already POSIXct here).
spellsDF$duration <- difftime(spellsDF$spellEnd, spellsDF$spellBegin, units='days')
range(spellsDF$duration)

## Plain numeric day count. Converted directly from the difftime instead of
## going through character strings and a list column.
spellsDF$numDays <- as.numeric(spellsDF$duration, units = 'days')

## this treats interrupted maintainership separately, i.e. 500 days, 1 day someone else, original person 500 days would count 3x

## Total days each maintainer held each package (all of their spells summed).
maintainerStintDF <- spellsDF %>%
  group_by(package, maintainer) %>%
  summarize(maintDuration = sum(numDays))

## Total maintained days per package, across all maintainers.
daysDF <- spellsDF %>%
  group_by(package) %>%
  summarize(total.maintDuration = sum(numDays))

## Each maintainer's share of the package's total maintained days.
maintainerStintDF <- merge(maintainerStintDF, daysDF, by = 'package', all.x = TRUE)
maintainerStintDF$marketshare <- with(maintainerStintDF, maintDuration / total.maintDuration)
summary(maintainerStintDF$marketshare)

## Sum of squared shares per package.
maintainerInequalityDF <- maintainerStintDF %>%
  group_by(package) %>%
  summarize(hhi.manual = sum(marketshare ^ 2))


## Per-package summary of the maintainership spells.
spellSumDF <- spellsDF %>%
  group_by(package) %>%
  summarize(
    numMaintainers = n_distinct(maintainer), # distinct maintainers
    maintDuration = sum(numDays),            # will be a sum total
    numSpells = n(),                         # number of spell rows
    gini = gini(numDays),                    # inequality of spell lengths
    avgDuration = maintDuration / numMaintainers
  )

## NOTE(review): spellSumDF is already summarised by package, so each group
## here appears to hold a single maintDuration value -- confirm that a
## per-package gini is really what summativeGini is meant to be.
maintBurnoutDF <- spellSumDF %>%
  group_by(package) %>%
  summarize(summativeGini = gini(maintDuration))

## Number of releaseDF rows per package.
releaseSumDF <- releaseDF %>%
  group_by(package) %>%
  summarize(numReleases = n())



## One row per package, built from the cleaned upload history.
## TODO: the placeholder uploader name ('noody@nowhere' in the original note)
## still needs to be dealt with in uploaderCount.
packageDF <- uploadDF %>%
  group_by(package) %>%
  summarize(
    uploaderCount = n_distinct(canonicalUploaderName),
    maintainerCount = n_distinct(canonicalMaintainerName),
    uploadCount = n(),
    qa_uploads = sum(this_is_qa),
    qa_list_maint = sum(this_is_qalist),
    list_maint = sum(this_is_list)
  ) %>%
  ## qa@lists is also a list, so take those uploads back out of list_maint
  mutate(list_maint = list_maint - qa_list_maint)

## Add the spell summaries; packages without spell data are kept.
packageDF <- merge(packageDF, spellSumDF, by = 'package', all.x = TRUE)


#### now handling language separately in python
#langDF <- subset(releaseDF, releaseDF$hasLanguage==TRUE)
#langSumDF <- langDF %>% group_by(package) %>% summarize(flang = first(language, order_by=version), llang=last(language, order_by=version))
#langSumDF.clean <- subset(langSumDF, langSumDF$flang == langSumDF$llang)
#langSumDF.dirty <- subset(langSumDF, langSumDF$flang != langSumDF$llang)

## language data now comes from langDF (language_props.tsv, read at the top of this file) rather than a langSumDF built here

## Left-join the language columns and the underproduction results by package.
packageDF <- merge(packageDF, langDF, by = 'package', all.x = TRUE)
packageDF <- merge(packageDF, upDF, by = 'package', all.x = TRUE)

## Keep only packages that ended up with a non-missing is_underprod.
packageDF <- packageDF[!is.na(packageDF$is_underprod), , drop = FALSE]

## NOTE(review): the TODO column is removed from langDF only after the merge
## above, so packageDF still carries whatever TODO column langDF had --
## confirm whether it should be dropped before merging instead.
langDF$TODO <- NULL #don't want


## Package "birthdays": the earlier of the first spell start and the first
## changelog date.
notesDF$changelogFirstDate <- as.POSIXct(notesDF$changelogFirstDate)


birthDF <- spellsDF %>% group_by(package) %>% summarise(firstUpload = min(spellBegin))
subset(birthDF, is.na(birthDF$firstUpload)) #check for NAs
birthDF <- merge(birthDF, notesDF, by='package', all.x=TRUE)

## Packages without a changelog date fall back to their first upload.
missingChangelog <- is.na(birthDF$changelogFirstDate)
birthDF$changelogFirstDate[missingChangelog] <- birthDF$firstUpload[missingChangelog]

birthDF[is.na(birthDF$changelogFirstDate),] #should be empty

## Not used again in this script; kept for interactive inspection.
firstUpload.debian <- min(birthDF$firstUpload, notesDF$changelogFirstDate, na.rm=TRUE) #first ever upload to debian


## NOTE(review): no tz is given here while cutpoint above uses tz='UTC' -- confirm.
dateOfAnalysis <- as.POSIXct('2020-07-07')

birthDF$birthday <- pmin(birthDF$firstUpload, birthDF$changelogFirstDate)
## `units` is spelled out in full; the previous `unit=` only worked through
## partial argument matching.
birthDF$daysSinceFirst <- difftime(dateOfAnalysis, birthDF$birthday, units='days') #how long ago was the package introduced?
## NOTE(review): dividing a difftime keeps its class, so yearsOld is still a
## difftime labelled in days even though the values are years.
birthDF$yearsOld <- birthDF$daysSinceFirst / 365.25


packageDF <- merge(packageDF, birthDF, by='package', all.x=TRUE) ## data was collected 2020-07-07


## one for each era

#packageDF <- packageDF %>% mutate(Era.1980s = case_when((ada > 0) | (c.. > 0) | (objc > 0) | 
#				  (perl > 0) | (tcl > 0) ~ TRUE,
#                                  (c > 0) | (c.sharp > 0) | (ecmascript > 0) |
#                                  (fortran > 0) | (haskell > 0) | (java > 0)  |
#                                  (lisp > 0) | (lua > 0) | (ml > 0)  |
#                                  (ocaml > 0) | (pascal > 0)  | (php > 0) |
# (pike > 0)  | (python > 0)  | (r > 0)  |
 #                                 (scheme >  0) | (shell > 0)  | (vala > 0) ~ FALSE,
#				    TRUE ~ NA))

#packageDF <- packageDF %>% mutate(Era.1970s = case_when(c > 0 | fortran > 0 | lisp > 0 |
#                                  ml > 0 | pascal > 0 | scheme > 0 | shell > 0 ~ TRUE,
#				ada > 0 | c.sharp > 0 | c.. > 0 | ecmascript > 0 |
#					haskell > 0 | java > 0 | lua > 0 | objc > 0 | 
#					ocaml > 0 | perl > 0 | php > 0 | pike > 0 | python > 0 | 
#					r > 0 | tcl > 0 | vala > 0 ~ FALSE,
#				    TRUE ~ NA))

#packageDF <- packageDF %>% mutate(Era.1990s = case_when(c.sharp > 0 | ecmascript > 0 |
#                                  haskell > 0 | java > 0 | lua > 0 | ocaml > 0 |
#                                  php > 0 | pike > 0 | python > 0 |
#                                  r > 0 | vala > 0 ~ TRUE,
#				ada > 0 | c > 0 | c.. > 0 | fortran > 0 | lisp > 0 |
#                                  ml > 0 | objc > 0 | pascal > 0 | perl > 0 |
#                                  scheme > 0 | shell > 0 | tcl > 0 ~ FALSE,
#				    TRUE ~ NA))


#packageDF <- merge(packageDF, centDF, by="package", all.x=TRUE) ## need to get all graph data into packageDF here; this is centrality, what else do I need? ## cent was in the old closer network
## Left-join the commenter-network centrality table onto packageDF by package.
## Open question carried over: what other graph data still needs to be brought into packageDF?
packageDF <- merge(x = packageDF, y = centDF.comment, by = "package", all.x = TRUE)

#packageDF<- packageDF %>% mutate(maintainership_type=case_when(((uploaderCount == 1) & (list_maint == 0)) ~ 'Solo', 
#					     (uploaderCount == 1 & list_maint > 0) ~ 'Team Affiliated Solo', 
#					     (uploaderCount > 1 & list_maint > 0) ~ 'Team', 
#					     ((uploaderCount > 1) & (list_maint == 0)) ~ 'Loose', 
#					     TRUE ~ 'Other') 

			#)


#table(packageDF$maintainership_type)
#t <- subset(packageDF, packageDF$maintainership_type == 'Other')
#print("Do any cases fall through? Your unresolved maintainership types are:")
#
## Disabled together with the maintainership_type check above: `t` is only
## assigned in the commented-out subset() line, so this call printed the base
## function t() instead of the unresolved cases.
#head(t) ##should be blank


## dim() before and after each merge shows whether the row count changed.
dim(packageDF)
packageDF <- merge(packageDF, maintainerInequalityDF, by='package', all.x=TRUE)
dim(packageDF)

## now load in package-specific network data
## NOTE(review): pkgNetDF is presumably provided by this .RData file -- it is
## not created anywhere in this script.

load(paste0(outDir, 'packageNetSummary.RData'))

packageDF <- merge(packageDF, pkgNetDF, by='package', all.x=TRUE)
dim(packageDF)





## Write the two output workspaces into outDir. version=2 selects the older
## workspace format instead of the current default.
## Full dataset: graph objects, language tables, and every prepared data frame.
save(projOnly.comment, centDF.comment, compDF.comment, langAgeDF, langDF, packageDF, maintBurnoutDF, releaseDF, uploadDF, upDF, spellsDF, releaseSumDF, spellSumDF, file=paste0(outDir,'fullDataset.RData'), version=2)
## Smaller subset for exploratory analysis (releaseDF used to be listed twice; once is enough).
save(packageDF, upDF, releaseDF, spellsDF, spellSumDF, releaseSumDF, langDF, file=paste0(outDir,'EDADataset.RData'), version=2)


